import os
import gzip
import pybedtools
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import fisher_exact
from pylab import *
from numpy import *


def read_significance():
    """Read a signed significance score per enhancer from 'enhancers.deseq.txt'.

    The file is split on whitespace. Its first line must name the columns
    ``enhancer`` followed by ``<t>_basemean``, ``<t>_log2fc`` and
    ``<t>_pvalue`` for ``<t>`` in 00hr, 01hr, 04hr, 12hr, 24hr, 96hr and
    ``all`` (22 columns). Only the three ``all_*`` columns are used.

    Returns:
        dict mapping the enhancer name (first column) to
        ``-log10(all_pvalue) * sign(all_log2fc)``.

    Raises:
        AssertionError: if a header column has an unexpected name, a data
            row does not have exactly 22 fields, or ``all_basemean`` is not
            positive.
    """
    filename = 'enhancers.deseq.txt'
    print("Reading %s" % filename)
    timepoints = ("00hr", "01hr", "04hr", "12hr", "24hr", "96hr", "all")
    measures = ("basemean", "log2fc", "pvalue")
    expected = ["enhancer"]
    expected.extend(
        "%s_%s" % (timepoint, measure)
        for timepoint in timepoints
        for measure in measures
    )
    ppvalues = {}
    # The context manager closes the file even when a check below fails.
    with open(filename) as handle:
        words = next(handle).split()
        for column, title in enumerate(expected):
            assert words[column] == title, \
                "column %d: expected %r, found %r" % (column, title, words[column])
        for line in handle:
            words = line.split()
            assert len(words) == len(expected)
            name = words[0]
            basemean = float(words[19])  # all_basemean
            assert basemean > 0
            log2fc = float(words[20])  # all_log2fc
            pvalue = float(words[21])  # all_pvalue
            # The magnitude is the significance; the sign is the direction
            # of the fold change.
            ppvalues[name] = -log10(pvalue) * sign(log2fc)
    return ppvalues

# Lookup table built from overlap.txt: for every line, the 8th
# whitespace-separated field is the key and the 4th field is the value.
# Further down, the keys are matched against feature names in reporters.bed
# and the values against enhancer names from enhancers.deseq.txt.
d = {}
# The context manager closes the file even if a short line raises IndexError.
with open("overlap.txt") as stream:
    for line in stream:
        words = line.split()
        d[words[7]] = words[3]


# 2x2 count table.
#   rows:    0 = signed score > 0 (HiSeq), 1 = signed score < 0 (CAGE)
#   columns: 0 = reporter p-value < 0.05,  1 = otherwise
contingency = zeros((2, 2), int)
ppvalues = read_significance()
filename = "reporters.bed"
print("Reading", filename)
alignments = pybedtools.BedTool(filename)
for alignment in alignments:
    reporter = alignment.name
    # The BED score field is compared against 0.05 below.
    pvalue = float(alignment.score)
    # Translate the reporter name into an enhancer name; skip unmapped ones.
    name = d.get(reporter)
    if name is None:
        continue
    ppvalue = ppvalues.get(name)
    # Skip enhancers without a score and scores that are neither positive
    # nor negative (zero or NaN).
    if ppvalue is None or not (ppvalue > 0 or ppvalue < 0):
        continue
    i = 0 if ppvalue > 0 else 1
    j = 0 if pvalue < 0.05 else 1
    contingency[i, j] += 1

# Column 0 of the table holds the reporters with p-value < 0.05.
significant = contingency[:, 0]
# Row totals. The explicit method call does not depend on numpy's `sum`
# shadowing the builtin through the star imports; the builtin would return
# column totals plus one for the same arguments.
total = contingency.sum(axis=1)

fraction = significant / total
percentage = fraction * 100

figure(figsize=(2,2.5))

# Faint full-height background bars, stacked on top of the solid bars that
# show the significant percentage.
bar([0, 1], 100.0 * (1-fraction), bottom=percentage, color=['red', 'blue'], alpha=0.1)
bar([0, 1], percentage, color=['red', 'blue'])

# xticks returns (locations, labels); colour each tick label like its bar.
labels = xticks([0, 1], ["short\n($N = %d$)" % total[0], "long\n($N = %d$)" % total[1]], fontsize=8)

labels[1][0].set_color('red')
labels[1][1].set_color('blue')

yticks(fontsize=8)

subplots_adjust(left=0.35, right=0.85, bottom=0.3)

xlabel("Expression enrichment as\nshort or long capped RNAs", fontsize=8)
ylabel("Percentage of enhancers with\nsignificant reporter activity", fontsize=8)

print("Percentage of enhancers with significant reporter activity:")
print("Short: %.2f%%" % percentage[0])
print("Long: %.2f%%" % percentage[1])

# Test for association between the two rows and reporter significance.
oddsratio, pvalue = fisher_exact(contingency)
print("Fisher-exact oddsratio %.5f, p-value: %.5f" % (oddsratio, pvalue))

# Write the same figure as a raster and as a vector image.
filename = "figure_enhancer_reporter.png"
print("Saving figure as", filename)
savefig(filename)

filename = "figure_enhancer_reporter.svg"
print("Saving figure as", filename)
savefig(filename)
